
# coding: utf-8

# In[8]:


import os
# Work from the bulk-data folder (every backslash in the path is escaped).
os.chdir("D:\\USPTO\\BULK\\2005-2012\\2006-2012")


from xml.dom.minidom import parseString
import re

def _ipcr_code(node):
    """Build the IPC code string for one ``classification-ipcr`` element.

    The code is the text of the first ``section``, ``class``, ``subclass``,
    ``main-group`` and ``subgroup`` descendants, concatenated in that order
    with no separator.

    Returns:
        The concatenated code, or ``None`` when any of the five elements is
        missing or has no child node.
    """
    parts = []
    for tag in ("section", "class", "subclass", "main-group", "subgroup"):
        matches = node.getElementsByTagName(tag)
        if not matches:
            return None
        first_child = matches[0].firstChild
        if first_child is None:
            return None
        parts.append(first_child.nodeValue)
    return "".join(parts)


def process_single_xml_file(filename, writeon):
    """Write one ``number*ipc*ipc...`` line per patent found in ``filename``.

    ``filename`` is expected to hold several XML documents concatenated
    together, each introduced by its own ``<?xml ...?>`` declaration. For
    every document the first ``doc-number`` under the first
    ``publication-reference`` is read, followed by every complete IPC code
    found under the first ``classifications-ipcr`` element.

    Only documents whose number starts with ``0`` or ``1`` produce output; a
    leading ``0`` is dropped from the written number. Other documents are
    parsed but not written.

    Args:
        filename: Path of the concatenated XML file; read as ISO-8859-1.
        writeon: Writable text file object that receives the output lines.

    Raises:
        xml.parsers.expat.ExpatError: If a document is not well-formed XML.
        IndexError: If a document has no ``publication-reference`` or
            ``doc-number`` element.
    """
    # Read everything up front and release the file handle immediately.
    with open(filename, 'r', encoding="ISO-8859-1") as source:
        xml_file = source.read()

    # Split the file into one chunk per patent document.
    xml_list = re.split(r'<\?xml.*?\?>', xml_file)

    # xml_list[0] is whatever precedes the first declaration, not a document.
    for one_xml in xml_list[1:]:
        if not one_xml.strip():
            # Nothing between two declarations, so there is nothing to parse.
            continue

        dom = parseString(one_xml)
        publication = dom.getElementsByTagName("publication-reference")[0]
        pid = publication.getElementsByTagName("doc-number")[0].firstChild.nodeValue

        fields = [pid]
        ipcr_groups = dom.getElementsByTagName("classifications-ipcr")
        if ipcr_groups:
            for node in ipcr_groups[0].getElementsByTagName("classification-ipcr"):
                ipc = _ipcr_code(node)
                if ipc is not None:
                    fields.append(ipc)
        pair = "*".join(fields)

        if pid.startswith('0'):
            # Drop the leading zero of the document number.
            writeon.write(pair[1:] + '\n')
        elif pid.startswith('1'):
            writeon.write(pair + '\n')
        
        
        
        

def select_files_in_folder(dir, ext):
    """Lazily yield the path of each entry in ``dir`` whose name ends in ``.ext``.

    Entries come from ``os.listdir(dir)`` and each match is returned joined
    onto ``dir`` with ``os.path.join``.
    """
    matching = (
        entry
        for entry in os.listdir(dir)    # every name found directly in 'dir'
        if entry.endswith('.%s' % ext)  # keep names carrying the wanted extension
    )
    for entry in matching:
        yield os.path.join(dir, entry)  # hand back 'dir/entry'
            
# Generator function: yields the path of every file in 'dir' whose extension is '.ext' (a generator can be iterated only once).

# Output file: receives the lines written by process_single_xml_file.
# The with-block closes (and flushes) it even if processing an input raises.
with open("C:\\Users\\samsung\\Desktop\\ipc results\\2006_2012.csv", 'w', encoding = "utf8") as file:
    for xmlfile in select_files_in_folder("D:\\USPTO\\BULK\\2005-2012\\2006-2012", 'xml'):
        print(xmlfile)    # progress indicator: the file about to be processed
        process_single_xml_file(xmlfile, file)

print("DONE")

